# load packages, installing if missing
if (!require(librarian)){
  install.packages("librarian")
  library(librarian)
}
librarian::shelf(
  tidyverse, dismo, DT, here, htmltools, leaflet, mapview, raster, rgbif, 
  rgdal, rJava, sdmpredictors, sf, spocc, geojsonio, GGally, mgcv, maptools,
  caret,       # m: modeling framework
  pdp,         # X: partial dependence plots
  ranger,      # m: random forest modeling
  rpart,       # m: recursive partition modeling
  rpart.plot,  # m: recursive partition plotting
  rsample,     # d: split train/test data
  skimr,       # d: skim summarize data table
  vip         # X: variable importance
  )
select <- dplyr::select # overwrite raster::select

# readr's option is "show_col_types" (plural); the singular spelling is
# silently ignored, so column-type messages would still print
options(readr.show_col_types = FALSE, scipen = 999)

# set random seed for reproducibility
set.seed(42)

# graphical theme
ggplot2::theme_set(ggplot2::theme_light())

# directory to store data
dir_data <- here("data/sdm")
dir.create(dir_data, showWarnings = FALSE, recursive = TRUE)

# logical, redo full calculations within if statements or not
redo <- FALSE

1 Species Distribution Modeling: Explore

1.1 Get Species Observations

Species: Red-tailed Hawk (Buteo jamaicensis). Source: Audubon Field Guide. Total observations are reported after creating a bounding box around the habitat area and removing duplicate geometries.

# file paths for cached observations (tabular CSV + spatial GeoJSON)
obs_csv <- file.path(dir_data, "obs.csv")
obs_geo <- file.path(dir_data, "obs.geojson")

if (!file.exists(obs_geo) | redo){
  # get species occurrence data from GBIF with coordinates
  # (outer parentheses print the result while assigning)
  (res <- spocc::occ(
    query = 'Buteo jamaicensis',
    from = 'gbif',
    has_coords = TRUE,  # spell out TRUE; T is reassignable
    limit = 10000))
  
  # extract data frame from result and cache the raw table
  df <- res$gbif$data[[1]] 
  readr::write_csv(df, obs_csv)

  # convert to points of observation from lon/lat columns in data frame
  obs <- df %>% 
    # limit observations to bounding box around North and Central America
    # (drops a stray point in the ocean near Europe; see Question 2)
    filter(between(longitude, -167.593385, -51.171266),
           between(latitude, 5.645215, 71.374349)) %>% 
    sf::st_as_sf(
      coords = c("longitude", "latitude"),
      crs = st_crs(4326)) %>% 
    select(prov, key) %>%  # save space (joinable from obs_csv)
    distinct(geometry, .keep_all = TRUE) # remove duplicate geometries
  
  sf::write_sf(obs, obs_geo, delete_dsn = TRUE)
}

obs <- sf::read_sf(obs_geo)
nrow(obs) # number of rows
## [1] 9376

1.1.1 Map of species observation locations

# interactive map of observation points over an Esri street basemap
mapview::mapview(obs, map.types = "Esri.WorldStreetMap")

1.1.2 Questions:

Question 1: There were a total of 7,004,451 observations in GBIF for the Red-tailed hawk (Buteo jamaicensis) as of 2022-01-24.
Question 2: There was one point from the original dataset that was in the ocean near Europe. To fix this issue, I created a bounding box around the parts of North and Central America and used this bbox to exclude any locations outside of the Red-tailed Hawk's habitat.

1.2 Get environmental data

1.2.1 Explore environmental data options

# environmental predictor layers are cached under the data directory
dir_env <- file.path(dir_data, "env")

# point sdmpredictors at that cache directory
options(sdmpredictors_datadir = dir_env)

# list terrestrial (non-marine) datasets available from sdmpredictors
env_datasets <- sdmpredictors::list_datasets(terrestrial = TRUE, marine = FALSE)

# interactive table of the candidate datasets
env_datasets %>%
  dplyr::select(dataset_code, description, citation) %>%
  DT::datatable()

# datasets to pull layers from
env_datasets_vec <- c("WorldClim", "ENVIREM")

# list every layer those datasets offer
env_layers <- sdmpredictors::list_layers(env_datasets_vec)
DT::datatable(env_layers)

1.2.2 Choose environmental data layers

# layers chosen after inspecting the tables above (and the literature)
env_layers_vec <- c(
  "WC_alt",    # altitude
  "WC_bio1",   # annual mean temperature
  "WC_bio2",   # mean diurnal temperature range
  "ER_tri",    # terrain roughness
  "WC_bio12")  # annual precipitation

# download (or read cached) rasters and stack them
env_stack <- load_layers(env_layers_vec)

# interactive plot layers, hiding all but first (select others)
# mapview(env_stack, hide = T) # makes the html too big for Github
plot(env_stack, nc = 2)

1.2.3 Questions:

Question 3: The environmental layers that I choose include: altitude, annual mean temperature, mean diurnal temperature range, terrain roughness, and annual precipitation. In the literature I found, other studies used: maximum and minimum temperatures, precipitation, solar radiation, wind speed, and cloud cover as abiotic factors. I felt that the first two variables from the literature were covered by annual mean temperature, mean diurnal temperature range, and annual precipitation. The other variables were not present in the available layers, so I did not include them.

1.2.4 Raster plot of environmental rasters clipped to species range

# file paths for cached hull polygon and clipped raster stack
obs_hull_geo  <- file.path(dir_data, "obs_hull.geojson")
env_stack_grd <- file.path(dir_data, "env_stack.grd")

if (!file.exists(obs_hull_geo) | redo){
  # convex hull around all observation points (rough species-range polygon)
  obs_hull <- sf::st_convex_hull(st_union(obs))
  
  # save obs hull
  write_sf(obs_hull, obs_hull_geo)
}
obs_hull <- read_sf(obs_hull_geo)

# show points and hull on map
mapview(
  list(obs, obs_hull))
if (!file.exists(env_stack_grd) | redo){
  obs_hull_sp <- sf::as_Spatial(obs_hull)
  # mask cells outside the hull, then crop the extent to the hull's bbox
  env_stack <- raster::mask(env_stack, obs_hull_sp) %>% 
    raster::crop(extent(obs_hull_sp))
  writeRaster(env_stack, env_stack_grd, overwrite = TRUE)
}
env_stack <- stack(env_stack_grd)

# show map
plot(env_stack, nc=2)

1.2.5 Pseudo-Absence

# file paths for cached pseudo-absence points and combined point data
absence_geo <- file.path(dir_data, "absence.geojson")
pts_geo     <- file.path(dir_data, "pts.geojson")
pts_env_csv <- file.path(dir_data, "pts_env.csv")

if (!file.exists(absence_geo) | redo){
  # get raster count of observations per cell
  r_obs <- rasterize(
    sf::as_Spatial(obs), env_stack[[1]], field=1, fun='count')
  
  # create mask of cells that have environment data but NO observations
  # (candidate cells for pseudo-absence sampling)
  r_mask <- mask(env_stack[[1]] > -Inf, r_obs, inverse = TRUE)
  
  # generate as many random pseudo-absence points as there are presences
  absence <- dismo::randomPoints(r_mask, nrow(obs)) %>% 
    as_tibble() %>% 
    st_as_sf(coords = c("x", "y"), crs = 4326)
  
  write_sf(absence, absence_geo, delete_dsn = TRUE)
}
absence <- read_sf(absence_geo)

# show map of presence (green), ie obs, and pseudo-absence (gray)
mapview(obs, col.regions = "green") + 
  mapview(absence, col.regions = "gray")
if (!file.exists(pts_env_csv) | redo){

  # combine presence and absence into single set of labeled points 
  pts <- rbind(
    obs %>% 
      mutate(present = 1) %>% 
      select(present, key),
    absence %>% 
      mutate(present = 0, 
             key = NA)) %>% 
    mutate(ID = row_number()) %>%  # row_number() over 1:n(): safe on 0 rows
    relocate(ID)
  write_sf(pts, pts_geo, delete_dsn = TRUE)

  # extract raster values for points
  # as_tibble(), not tibble(): tibble(df) would nest the whole data frame
  # as a single column and break the left_join on ID
  pts_env <- raster::extract(env_stack, as_Spatial(pts), df = TRUE) %>% 
    as_tibble() %>% 
    # join present and geometry columns to raster value results for points
    left_join(pts %>% select(ID, present), 
              by = "ID") %>% 
    relocate(present, .after = ID) %>% 
    # extract lon, lat as single columns
    mutate(lon = st_coordinates(geometry)[, 1],
           lat = st_coordinates(geometry)[, 2]) %>% 
    select(-geometry)
  write_csv(pts_env, pts_env_csv)
}
pts_env <- read_csv(pts_env_csv)

pts_env %>% 
  # show first 10 presence, last 10 absence
  slice(c(1:10, (nrow(pts_env) - 9):nrow(pts_env))) %>% 
  DT::datatable(rownames = FALSE,
                options = list(dom = "t",
                               pageLength = 20))

1.3 Term Plots

# density of every predictor, split by presence (1) vs absence (0)
pts_env %>%
  select(-ID) %>%
  mutate(present = factor(present)) %>%
  pivot_longer(-present) %>%
  ggplot(aes(x = value, fill = present)) +
  geom_density() +
  scale_fill_manual(values = alpha(c("gray", "green"), 0.5)) +
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(expand = c(0, 0)) +
  theme_bw() +
  facet_wrap(~name, scales = "free") +
  theme(legend.position = c(1, 0),
        legend.justification = c(1, 0))

2 Species Distribution Modeling: Logistic Regression

2.1 Pairs Plots

# pairwise scatter / density / correlation panels, colored by presence
pts_env %>%
  select(-ID) %>%
  GGally::ggpairs(aes(color = factor(present), alpha = 0.5))

2.2 Logistic Regression

2.2.1 Setup Data

# setup model data: drop the row identifier (not a predictor)
# and remove any rows with missing raster values
model_data <- pts_env %>%
  select(-ID) %>%
  tidyr::drop_na()
nrow(model_data)
## [1] 18689

2.2.2 Linear Model

# fit a linear model (OLS; 0/1 response treated as numeric, so
# predictions are not bounded to [0, 1])
mdl_lm <- lm(present ~ ., data = model_data) # . means all other columns (X)
summary(mdl_lm)
## 
## Call:
## lm(formula = present ~ ., data = model_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.11564 -0.41687  0.05479  0.41187  1.23357 
## 
## Coefficients:
##                Estimate  Std. Error t value             Pr(>|t|)    
## (Intercept)  0.53038786  0.07882251   6.729      0.0000000000176 ***
## WC_alt      -0.00011443  0.00001244  -9.196 < 0.0000000000000002 ***
## WC_bio1      0.02994807  0.00220208  13.600 < 0.0000000000000002 ***
## WC_bio2     -0.04723098  0.00215837 -21.883 < 0.0000000000000002 ***
## ER_tri      -0.00042966  0.00012862  -3.340             0.000838 ***
## WC_bio12    -0.00027108  0.00001039 -26.100 < 0.0000000000000002 ***
## lon         -0.00176759  0.00030868  -5.726      0.0000000104210 ***
## lat          0.00825249  0.00154596   5.338      0.0000000950219 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4516 on 18681 degrees of freedom
## Multiple R-squared:  0.1846, Adjusted R-squared:  0.1843 
## F-statistic: 604.2 on 7 and 18681 DF,  p-value: < 0.00000000000000022
# predict on the fitting data (note: predict.lm ignores type = "response")
y_predict_lm <- predict(mdl_lm, model_data, type = "response")
# take truth from model_data, not pts_env: model_data has NA rows dropped,
# so pts_env$present can differ in length from the predictions
y_true <- model_data$present

range(y_predict_lm)
## [1] -0.8936088  1.1156414
range(y_true)
## [1] 0 1

2.2.3 Generalized Linear Model

# logistic regression: binomial GLM with a logit link keeps
# predicted probabilities in (0, 1)
mdl_glm <- glm(
  present ~ .,
  data   = model_data,
  family = binomial(link = "logit"))
summary(mdl_glm)
## 
## Call:
## glm(formula = present ~ ., family = binomial(link = "logit"), 
##     data = model_data)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.50204  -1.00911  -0.03428   0.99099   2.77627  
## 
## Coefficients:
##                Estimate  Std. Error z value             Pr(>|z|)    
## (Intercept)  0.06305959  0.40734341   0.155              0.87697    
## WC_alt      -0.00060899  0.00006371  -9.559 < 0.0000000000000002 ***
## WC_bio1      0.15429205  0.01131031  13.642 < 0.0000000000000002 ***
## WC_bio2     -0.23356351  0.01147043 -20.362 < 0.0000000000000002 ***
## ER_tri      -0.00200862  0.00066290  -3.030              0.00245 ** 
## WC_bio12    -0.00141805  0.00005651 -25.096 < 0.0000000000000002 ***
## lon         -0.00686829  0.00155792  -4.409        0.00001040251 ***
## lat          0.04786281  0.00804013   5.953        0.00000000263 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 25908  on 18688  degrees of freedom
## Residual deviance: 22035  on 18681  degrees of freedom
## AIC: 22051
## 
## Number of Fisher Scoring iterations: 4
# predicted probabilities on the response (0-1) scale
y_predict_glm <- predict(mdl_glm, model_data, type = "response")

range(y_predict_glm)
## [1] 0.0005872994 0.9562871347

2.2.3.1 GLM Term Plots

termplot(mdl_glm, partial.resid = TRUE, se = TRUE, main = F)

2.2.4 Generalized Additive Model

# fit a generalized additive model with smooth predictors
mdl_gam <- mgcv::gam(
  formula = present ~ s(WC_alt) + s(WC_bio1) + 
    s(WC_bio2) + s(ER_tri) + s(WC_bio12) + s(lon) + s(lat), 
  family = binomial, data = model_data)

summary(mdl_gam)
## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## present ~ s(WC_alt) + s(WC_bio1) + s(WC_bio2) + s(ER_tri) + s(WC_bio12) + 
##     s(lon) + s(lat)
## 
## Parametric coefficients:
##             Estimate Std. Error z value  Pr(>|z|)    
## (Intercept)  -0.8549     0.2163  -3.953 0.0000772 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##               edf Ref.df Chi.sq             p-value    
## s(WC_alt)   8.309  8.840  337.4 <0.0000000000000002 ***
## s(WC_bio1)  8.972  8.999 1251.2 <0.0000000000000002 ***
## s(WC_bio2)  8.300  8.792  615.6 <0.0000000000000002 ***
## s(ER_tri)   8.821  8.969  121.9 <0.0000000000000002 ***
## s(WC_bio12) 5.944  6.680  618.5 <0.0000000000000002 ***
## s(lon)      8.817  8.984  598.5 <0.0000000000000002 ***
## s(lat)      8.819  8.990  295.4 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =   0.52   Deviance explained = 45.7%
## UBRE = -0.24159  Scale est. = 1         n = 18689

2.2.4.1 GAM Term Plots

plot(mdl_gam, scale = 0)

2.2.5 Questions

Question 4: There are two environment variables that seem to contribute most towards presence. For altitude, below about 250 ft of altitude there is a higher chance of presence with a high confidence range. Where there is a diurnal temperature range (bio2) between about 7 and 12 degrees there is a higher chance of species presence. Additionally, between about -120 and -100 longitudinal degrees there is a higher chance of species presence.

2.2.6 Maxent (Maximum Entropy)

mdl_maxent_rds <- file.path(dir_data, "mdl_maxent.rds") # path for cached model

# show version of maxent when rendering non-interactively
# (braced multi-line if per the tidyverse style guide)
if (!interactive()) {
  maxent()
}
## This is MaxEnt version 3.4.3
# plot environmental rasters
plot(env_stack, nc = 2)

2.2.6.1 Maxent variable contribution plot

# presence-only points as sp::SpatialPoints — maxent() predates the sf class
obs_sp <- sf::as_Spatial(obs)

# fit a maximum entropy model, or reload the cached fit
if (!file.exists(mdl_maxent_rds) | redo){
  mdl_fit <- maxent(env_stack, obs_sp)
  readr::write_rds(mdl_fit, mdl_maxent_rds)
}
mdl_maxent <- read_rds(mdl_maxent_rds)

# relative contribution of each predictor to the fitted model
plot(mdl_maxent)

2.2.6.2 Maxent term plots

# term plots: predicted suitability as a function of each predictor
response(mdl_maxent)

2.2.6.3 Maxent prediction

# predict habitat suitability across the full raster stack
y_predict_maxent <- predict(env_stack, mdl_maxent)

# raw prediction surface with country borders overlaid for reference
plot(y_predict_maxent, main = 'Maxent, raw prediction')
data(wrld_simpl, package = "maptools")
plot(wrld_simpl, add = TRUE, border = 'dark grey')

2.2.7 Questions

Question 4: There are four environment variables that seem to contribute most towards presence for the maxent model. The first two, altitude and bio2, were similar to the GAM results. For altitude, below about 500 ft of altitude there is a high chance of presence. Where there is a diurnal temperature range (bio2) between about 5 and 15 degrees there is a high chance of species presence. The other two variables (mean annual temperature and annual precipitation) had values that seemed to predict species presence that did not show up in the GAM results. When the mean annual temperature (bio1) is between 10 and 20 degrees there is a high chance of species presence. And when the annual precipitation is below 1,000 there is a high chance of species presence.

3 Species Distribution Modeling: Decision Trees

3.1 Set up data

# build the decision-tree data set: factor response (classification),
# no ID column, complete rows only
decision_tree_data <- pts_env %>%
  select(-ID) %>%
  mutate(present = factor(present)) %>%
  na.omit()

# summary table of the modeling data
skim(decision_tree_data)
Data summary
Name decision_tree_data
Number of rows 18689
Number of columns 8
_______________________
Column type frequency:
factor 1
numeric 7
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
present 0 1 FALSE 2 0: 9345, 1: 9344

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
WC_alt 0 1 558.07 638.36 -70.00 101.00 279.00 808.00 3738.00 ▇▂▁▁▁
WC_bio1 0 1 12.26 6.59 -5.60 7.70 12.30 17.30 29.10 ▂▅▇▇▂
WC_bio2 0 1 12.72 2.52 4.00 11.10 12.40 14.50 21.20 ▁▃▇▃▁
ER_tri 0 1 25.95 34.72 0.00 5.33 11.46 32.11 277.85 ▇▁▁▁▁
WC_bio12 0 1 787.55 452.51 52.00 438.00 753.00 1059.00 5648.00 ▇▂▁▁▁
lon 0 1 -100.69 16.70 -135.29 -117.08 -100.87 -86.96 -59.96 ▅▇▇▇▂
lat 0 1 38.06 8.95 9.12 33.09 38.55 43.79 60.62 ▁▂▇▆▂

3.2 Split data into training and testing

# create training set with 80% of full data, stratified on the response
# so the presence/absence balance is preserved in the split
d_split  <- rsample::initial_split(decision_tree_data, 
                                   prop = 0.8, 
                                   strata = "present")
d_train  <- rsample::training(d_split)
# NOTE(review): rsample::testing(d_split) is never extracted in this view —
# confirm the held-out 20% is evaluated elsewhere
# show number of rows present is 0 vs 1 (all data)
table(decision_tree_data$present)
## 
##    0    1 
## 9345 9344
# show number of rows present is 0 vs 1 (training data)
table(d_train$present)
## 
##    0    1 
## 7476 7475

3.3 Rpart models

3.3.1 Partition, depth=1

# run decision stump model: a single split (maxdepth = 1), no complexity
# pruning (cp = 0), at least 5 observations per terminal node
mdl_dt1 <- rpart(present ~ ., 
                 data = d_train,
                 control = list(cp = 0, 
                                minbucket = 5, 
                                maxdepth = 1))
mdl_dt1
## n= 14951 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 14951 7475 0 (0.50003344 0.49996656)  
##   2) WC_bio1< 6.35 2824  179 0 (0.93661473 0.06338527) *
##   3) WC_bio1>=6.35 12127 4831 1 (0.39836728 0.60163272) *
# plot tree (shrink margins so the tree fills the device)
par(mar = c(1, 1, 1, 1))
rpart.plot(mdl_dt1)

3.3.2 Partition, depth=default

# decision tree with rpart's default control settings
mdl_dt2 <- rpart(present ~ ., data = d_train)
mdl_dt2
## n= 14951 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 14951 7475 0 (0.50003344 0.49996656)  
##    2) WC_bio1< 6.35 2824  179 0 (0.93661473 0.06338527) *
##    3) WC_bio1>=6.35 12127 4831 1 (0.39836728 0.60163272)  
##      6) lon>=-116.1268 8570 4214 0 (0.50828471 0.49171529)  
##       12) WC_bio2>=12.55 4196 1400 0 (0.66634890 0.33365110) *
##       13) WC_bio2< 12.55 4374 1560 1 (0.35665295 0.64334705)  
##         26) lat< 23.14866 580   30 0 (0.94827586 0.05172414) *
##         27) lat>=23.14866 3794 1010 1 (0.26620980 0.73379020) *
rpart.plot(mdl_dt2)

# plot cross-validation error against the complexity parameter
plotcp(mdl_dt2)

# rpart cross validation results: CP table of error by number of splits
mdl_dt2$cptable
##           CP nsplit rel error    xerror        xstd
## 1 0.32976589      0 1.0000000 1.0248829 0.008176363
## 2 0.09337793      1 0.6702341 0.6702341 0.007721249
## 3 0.06956522      3 0.4834783 0.4880268 0.007025512
## 4 0.01000000      4 0.4139130 0.4172575 0.006646461
##           CP nsplit rel error    xerror        xstd
## 1 0.32976589      0 1.0000000 1.0248829 0.008176363
## 2 0.09337793      1 0.6702341 0.6702341 0.007721249
## 3 0.06956522      3 0.4834783 0.4880268 0.007025512
## 4 0.01000000      4 0.4139130 0.4172575 0.006646461

3.3.3 Complexity parameter plot

# caret: 10-fold cross-validated rpart over a grid of 20 cp values
mdl_caret <- train(
  present ~ .,
  data       = d_train,
  method     = "rpart",
  trControl  = trainControl(method = "cv", number = 10),
  tuneLength = 20)

# accuracy as a function of the complexity parameter
ggplot(mdl_caret)

3.3.4 Questions

Question 5: Based on the complexity plot threshold, what size of tree is recommended?

3.3.5 Variable importance plot

vip(mdl_caret, num_features = 40, bar = FALSE)

3.3.6 Questions

Question 6: what are the top 3 most important variables of your model?

3.3.7 Partial dependence plots

# Construct partial dependence plots: single predictors, then the pair
pd_lat  <- partial(mdl_caret, pred.var = "lat") %>% autoplot()
pd_bio2 <- partial(mdl_caret, pred.var = "WC_bio2") %>% autoplot()
pd_both <- partial(mdl_caret, pred.var = c("lat", "WC_bio2")) %>%
  plotPartial(levelplot = FALSE, zlab = "yhat", drape = TRUE,
              colorkey = TRUE, screen = list(z = -20, x = -60))

# Display the three panels side by side
gridExtra::grid.arrange(pd_lat, pd_bio2, pd_both, ncol = 3)

3.4 Random Forests

3.4.1 Fit

# number of predictor features (all columns except the response)
n_features <- length(setdiff(names(d_train), "present"))

# fit a default random forest model
# (present is a factor, so ranger fits a classification forest)
mdl_rf <- ranger(present ~ ., data = d_train)

# for a classification forest, prediction.error is the OOB
# misclassification rate; its square root is reported here but is
# not an RMSE in the regression sense
(default_rmse <- sqrt(mdl_rf$prediction.error))
## [1] 0.2921395

3.4.2 Variable importance plot

# refit twice to compare the two variable-importance modes ranger offers:
# impurity-based vs permutation-based
mdl_impurity <- ranger(
  present ~ .,
  data = d_train,
  importance = "impurity")
mdl_permutation <- ranger(
  present ~ .,
  data = d_train,
  importance = "permutation")

# importance plots side by side (left: impurity, right: permutation)
p1 <- vip::vip(mdl_impurity, bar = FALSE)
p2 <- vip::vip(mdl_permutation, bar = FALSE)
gridExtra::grid.arrange(p1, p2, nrow = 1)

3.4.3 Questions

Question 7: How might variable importance differ between rpart and RandomForest in your model outputs?